Data will be updated weekly

Each new record is accumulated data from previous days

This is the dataset that describes Equipment Losses & Death Toll & Military Wounded & Prisoner of War of russians in 2022 Ukraine russia War.

Data:

russia_losses_equipment.csv - contains Equipment Losses during the war

Data Sources:

Main data sources are General Staff of the Armed Forces of Ukraine and Ministry of Defence of Ukraine. Data from different battlefields are gathered. The calculation is complicated by the high intensity of hostilities. Data are being updated.   https://www.kaggle.com/datasets/piterfm/2022-ukraine-russian-war
Tracking:
Aircraft
Helicopter
Tank
Armored Personnel Carrier (APC)
Multiple Rocket Launcher (MRL)
Field Artillery
Military Auto - has not been tracked since 2022-05-01; joined with Fuel Tank into Vehicles and Fuel Tanks
Fuel Tank - has not been tracked since 2022-05-01; joined with Military Auto into Vehicles and Fuel Tanks
Anti-aircraft warfare
Drone - UAV+RPA
Naval Ship - Warships, Boats
Anti-aircraft Warfare
Mobile SRBM System - has not been tracked since 2022-05-01; joined into Cruise Missiles
Vehicles and Fuel Tanks - appear since 2022-05-01 as a sum of Fuel Tank and Military Auto
Cruise Missiles - appear since 2022-05-01
Direction of Greatest Losses - appear since 2022-04-25

Acronyms:
MRL - Multiple Rocket Launcher,
APC - Armored Personnel Carrier,
SRBM - Short-range ballistic missile,
UAV - Unmanned Aerial Vehicle,
RPA - Remotely Piloted Aircraft.

Dataset History:

2022-03-02 - dataset is created (after 7 days of the War).

library(TSstudio)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ggplot2)
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
##   method                 from    
##   autoplot.Arima         forecast
##   autoplot.acf           forecast
##   autoplot.ar            forecast
##   autoplot.bats          forecast
##   autoplot.decomposed.ts forecast
##   autoplot.ets           forecast
##   autoplot.forecast      forecast
##   autoplot.stl           forecast
##   autoplot.ts            forecast
##   fitted.ar              forecast
##   fortify.ts             forecast
##   residuals.ar           forecast
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(tseries)
library(fUnitRoots)
library(pdR)
library(uroot)
library(TSA)
## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast
## 
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ MASS::select()  masks dplyr::select()
## ✖ readr::spec()   masks TSA::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(anomalize)
library(pracma)
## 
## Attaching package: 'pracma'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(prophet)
## Zorunlu paket yükleniyor: Rcpp
## Zorunlu paket yükleniyor: rlang
## 
## Attaching package: 'rlang'
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# --- Data preparation --------------------------------------------------------
# NOTE(review): absolute Windows path — consider a relative path so the script
# is portable.
data=read.csv("C:/Users/ASUS/Desktop/time series project/2022 Russia Ukraine War.csv")
data1=data
# Keep only the date and the cumulative tank-loss columns.
data=data[,c("date","tank")]
data["date"]=as.Date(data$date,format="%Y-%m-%d")
data
# Year/month factors for the grouped boxplots below.
data2<- data %>% dplyr::mutate(year = lubridate::year(date), month = lubridate::month(date))
data2$month<-as.factor(data2$month)
data2$year<-as.factor(data2$year)
data2
# zoo series indexed by date (first column becomes the index).
data3=read.zoo(data)
bp <- ggplot(data2, aes(x=month, y=tank, fill=month)) + 
  geom_boxplot()+
  labs(title="Boxplot Across Months",x="Month", y = "tank")
bp

bp <- ggplot(data2, aes(x=year, y=tank, fill=year)) + 
  geom_boxplot()+
  labs(title="Boxplot Across Years",x="year", y = "tank")
bp

# Month profile per year (month converted back to numeric for the x-axis).
ggplot(data2, aes(as.numeric(as.character(month)), tank)) + geom_line(aes(colour = year))+xlab("month")

# Sample ACF/PACF of the raw cumulative series.
acf(data3)

pacf(data3)

# Quick inspection of the series.
data
head(data3)
## 2022-02-25 2022-02-26 2022-02-27 2022-02-28 2022-03-01 2022-03-02 
##         80        146        150        150        198        211
str(data3)
## 'zoo' series from 2022-02-25 to 2023-11-12
##   Data: int [1:626] 80 146 150 150 198 211 217 251 269 285 ...
##   Index:  Date[1:626], format: "2022-02-25" "2022-02-26" "2022-02-27" "2022-02-28" "2022-03-01" ...
summary(data3)
##      Index                data3     
##  Min.   :2022-02-25   Min.   :  80  
##  1st Qu.:2022-07-31   1st Qu.:1764  
##  Median :2023-01-03   Median :3037  
##  Mean   :2023-01-03   Mean   :2888  
##  3rd Qu.:2023-06-08   3rd Qu.:3898  
##  Max.   :2023-11-12   Max.   :5349
autoplot(data3)

The plot shows that the data has a trend and no seasonality. Hence, the process is not stationary.

# --- Train/test split --------------------------------------------------------
# Training data runs through 2023-10-12; everything after is held out.
data
split=which(data$date=="2023-10-12")
train=data[seq_len(split),]      # seq_len() instead of 0:split (the 0 index was silently dropped)
test=data[(split+1):nrow(data),] # nrow() is clearer than length(data$date)
test2=test                       # keep an untouched copy of the test set
traindf <- as_tbl_time(train,index = date)
class(traindf)
## [1] "tbl_time"   "tbl_df"     "tbl"        "data.frame"
# STL decomposition of the training series; flag remainder anomalies via GESD.
traindf %>% 
  time_decompose(tank, method = "stl", trend = "auto") %>%
  anomalize(remainder, method = "gesd", alpha = 0.5) %>%
  plot_anomaly_decomposition()
## frequency = 7 days
## trend = 91 days

# Visualize the recomposed anomalies on the training window.
traindf %>% 
  time_decompose(tank) %>%
  anomalize(remainder, method = "gesd", alpha = 0.5) %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)
## frequency = 7 days
## trend = 91 days

# Drop rows flagged as anomalies; keep the decomposition's "observed" column.
traindf=traindf %>% 
  time_decompose(tank, method = "stl", trend = "auto") %>%
  anomalize(remainder, method = "gesd", alpha = 0.5) %>%
  filter(anomaly == 'No')  
## frequency = 7 days
## trend = 91 days
# Re-attach the full date index (removed anomaly dates become NA) ...
train=merge(train["date"],traindf[,c("date","observed")],by="date",all.x = TRUE)
# ... and let tsclean() impute the resulting missing values.
train["observed"]=tsclean(train$observed)
train["observed"]=as.numeric(train$observed)
colnames(train)=c("date","tank")

train=as_tbl_time(train,index=date)
test=as_tbl_time(test,index=date)
train2=train
train=read.zoo(train)
ts_plot(train)
# Box-Cox lambda via profile likelihood of an intercept-only model (MASS::boxcox).
b=boxcox(lm(train ~ 1))

b$x[which.max(b$y)]
## [1] 1.030303

Since lambda is close to 1, there is no need to transform data.

# ACF/PACF of the cleaned training series.
acf(train)

The ACF plot shows us there exists deterministic trend.

pacf(train)

Since the ACF plot shows that the process is not stationary, there is no need to look at the PACF plot.

# KPSS test, H0: level stationarity.
kpss.test(train, null="Level")
## Warning in kpss.test(train, null = "Level"): p-value smaller than printed
## p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train
## KPSS Level = 8.4175, Truncation lag parameter = 6, p-value = 0.01

Since the p-value is less than 0.05, we reject the null hypothesis and conclude that the dataset is not stationary.

# KPSS test, H0: trend stationarity.
kpss.test(train, null="Trend")
## Warning in kpss.test(train, null = "Trend"): p-value smaller than printed
## p-value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  train
## KPSS Trend = 1.8388, Truncation lag parameter = 6, p-value = 0.01

Since p-value is less than 0.05, the trend is stochastic.

mean(train)
## [1] 2766.877

Since the mean of the time series is not zero or close to zero, we need to apply ADF test.

# ADF test with a constant (type "c"), H0: unit root.
adfTest(train, lags=1, type="c") 
## Warning in adfTest(train, lags = 1, type = "c"): p-value smaller than printed
## p-value
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 1
##   STATISTIC:
##     Dickey-Fuller: -5.7602
##   P VALUE:
##     0.01 
## 
## Description:
##  Sun Jan 21 20:09:15 2024 by user: ASUS

Since p-value is less than 0.05, the process has no unit root.

There is a contradiction between the KPSS test and the ADF test, so we need to check the plots.
Since the ACF plot shows a slow linear decay, we need to difference the data.

# Phillips-Perron test, H0: unit root.
pp.test(train)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  train
## Dickey-Fuller Z(alpha) = -4.8213, Truncation lag parameter = 6, p-value
## = 0.8408
## alternative hypothesis: stationary

There is a contradiction between the KPSS test & PP test and the ADF test, so we need to check the plots.
Since the ACF plot shows a slow linear decay, we need to difference the data.

# First difference to remove the trend.
difftank=diff(train)
acf(difftank)

pacf(difftank)

According to the ACF plot, the process still is not stationary.

kpss.test(difftank, null="Level")
## Warning in kpss.test(difftank, null = "Level"): p-value smaller than printed
## p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  difftank
## KPSS Level = 2.0185, Truncation lag parameter = 6, p-value = 0.01

Since the p-value is less than 0.05, we reject the null hypothesis and conclude that the differenced dataset is not stationary.

mean(difftank)
## [1] 7.883838

Since the mean of the time series is not zero or close to zero, we need to apply ADF test.

# ADF test with a constant on the differenced series.
adfTest(difftank, lags=5, type="c") 
## Warning in adfTest(difftank, lags = 5, type = "c"): p-value smaller than
## printed p-value
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 5
##   STATISTIC:
##     Dickey-Fuller: -5.5029
##   P VALUE:
##     0.01 
## 
## Description:
##  Sun Jan 21 20:09:15 2024 by user: ASUS

Since the p-value is smaller than 0.05, the ADF test suggests the differenced process has no unit root.

However, the KPSS test above still rejects stationarity, so we difference the series once more.

pp.test(difftank)
## Warning in pp.test(difftank): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  difftank
## Dickey-Fuller Z(alpha) = -554.04, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
# Second difference.
diff2tank=diff(difftank)
acf(diff2tank)

pacf(diff2tank)

# KPSS on the twice-differenced series.
kpss.test(diff2tank, null="Level")
## Warning in kpss.test(diff2tank, null = "Level"): p-value greater than printed
## p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff2tank
## KPSS Level = 0.020304, Truncation lag parameter = 6, p-value = 0.1

Since the p-value is greater than 0.05, we fail to reject the null hypothesis and conclude that the differenced dataset is stationary.

mean(diff2tank)
## [1] 0

Since the mean of the twice-differenced series is zero, we apply the ADF test without a constant (type = "nc") and then the PP test.

# ADF without constant ("nc") since the twice-differenced series has zero mean.
adfTest(diff2tank,lags =4,type = "nc" )
## Warning in adfTest(diff2tank, lags = 4, type = "nc"): p-value smaller than
## printed p-value
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 4
##   STATISTIC:
##     Dickey-Fuller: -16.7239
##   P VALUE:
##     0.01 
## 
## Description:
##  Sun Jan 21 20:09:16 2024 by user: ASUS
pp.test(diff2tank)
## Warning in pp.test(diff2tank): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff2tank
## Dickey-Fuller Z(alpha) = -738.73, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary

Since p-value is less than 0.05, the differenced process has no unit root.
Hence, for model, d=2 (I(2)).

# Candidate MA order from the ACF of the stationary series.
acf(diff2tank)

MA(4) or MA(1)

# Candidate AR order from the PACF.
pacf(diff2tank)

AR(4)

# Extended ACF table for mixed ARMA order identification (TSA::eacf).
eacf(diff2tank)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x o o o x o o o o o o  o  o  o 
## 1 x x o o x o o o o o o  o  o  o 
## 2 x x x o x o o o o o o  o  o  o 
## 3 x x x x o o o o o o o  o  o  o 
## 4 x x o x x o o o o o o  o  o  o 
## 5 x x o x o o o o o o o  o  o  o 
## 6 x x x x o o o o o o o  o  o  o 
## 7 x x o x o o o x o o o  o  o  o

According to the ACF, PACF, and EACF, the possible models are ARIMA(4,2,1) and ARIMA(4,2,4).

# Automatic order selection as a baseline.
auto.arima(train)
## Series: train 
## ARIMA(2,2,2) 
## 
## Coefficients:
##          ar1     ar2      ma1     ma2
##       0.7691  0.0390  -1.5921  0.6035
## s.e.  0.1523  0.0584   0.1478  0.1388
## 
## sigma^2 = 26.97:  log likelihood = -1817.17
## AIC=3644.35   AICc=3644.45   BIC=3666.28
checkFit=Arima(train,order = c(4,2,1))
checkFit
## Series: train 
## ARIMA(4,2,1) 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4      ma1
##       0.1364  0.1347  0.0266  0.0509  -0.9516
## s.e.  0.0467  0.0457  0.0450  0.0451   0.0208
## 
## sigma^2 = 27.09:  log likelihood = -1817.94
## AIC=3647.88   AICc=3648.02   BIC=3674.19

Since MA(1) is not 1 or close to 1, we did not get much difference.

# Coefficient / s.e. ratios: |ratio| > 2 is taken as significant below.
ar4=0.0509/0.0451
ma1=abs(-0.9516/0.0208)
ar4
## [1] 1.128603
ma1
## [1] 45.75
# ar4 is insignificant, so try ARIMA(3,2,1).
fit=Arima(train,order = c(3,2,1))
fit
## Series: train 
## ARIMA(3,2,1) 
## 
## Coefficients:
##          ar1     ar2     ar3      ma1
##       0.1276  0.1319  0.0243  -0.9419
## s.e.  0.0470  0.0463  0.0455   0.0217
## 
## sigma^2 = 27.1:  log likelihood = -1818.58
## AIC=3647.15   AICc=3647.26   BIC=3669.08
ar3=0.0243/0.0455
ma1=abs(-0.9419/0.0217)
ar3
## [1] 0.5340659
ma1
## [1] 43.40553
# ar3 also insignificant, so try ARIMA(2,2,1).
fit1=Arima(train,order = c(2,2,1))
fit1
## Series: train 
## ARIMA(2,2,1) 
## 
## Coefficients:
##          ar1     ar2      ma1
##       0.1252  0.1297  -0.9370
## s.e.  0.0474  0.0466   0.0212
## 
## sigma^2 = 27.07:  log likelihood = -1818.72
## AIC=3645.44   AICc=3645.51   BIC=3662.98
ar2=0.1297/0.0466
ma1=abs(-0.9370/0.0212)
ar2
## [1] 2.783262
ma1
## [1] 44.19811

Since the parameters are significant, fit1 can be used.

# Higher-order candidates suggested by the EACF.
fit2=Arima(train,order = c(4,2,4))
fit2
## Series: train 
## ARIMA(4,2,4) 
## 
## Coefficients:
##           ar1      ar2     ar3     ar4      ma1     ma2      ma3     ma4
##       -0.0955  -0.2698  0.7856  0.0431  -0.7213  0.2082  -1.0641  0.6089
## s.e.   0.1525   0.0989  0.1168  0.0582   0.1482  0.0158   0.0292  0.1395
## 
## sigma^2 = 26.78:  log likelihood = -1814.63
## AIC=3647.26   AICc=3647.56   BIC=3686.72
ar4=0.0431/0.0582
ma4=0.6089/0.1395
ar4
## [1] 0.7405498
ma4
## [1] 4.364875
# ar4 insignificant; drop one MA term instead.
fit3=Arima(train,order = c(4,2,3))
fit3
## Series: train 
## ARIMA(4,2,3) 
## 
## Coefficients:
##          ar1      ar2     ar3     ar4     ma1     ma2      ma3
##       0.1574  -0.8454  0.1250  0.1581  -0.972  1.0286  -0.9402
## s.e.  0.0478   0.0461  0.0476  0.0463   0.023  0.0174   0.0222
## 
## sigma^2 = 26.49:  log likelihood = -1812.4
## AIC=3640.8   AICc=3641.05   BIC=3675.88
ar4=0.1581/0.0463
ma3=abs(-0.9402/0.0222)
ar4
## [1] 3.414687
ma3
## [1] 42.35135

Since the parameters are significant, fit3 can be used.

# Further reduced-order candidates.
fit4=Arima(train,order = c(3,2,4))
fit4
## Series: train 
## ARIMA(3,2,4) 
## 
## Coefficients:
##          ar1     ar2     ar3      ma1     ma2      ma3     ma4
##       0.2321  0.0154  0.4459  -1.0472  0.2096  -0.5496  0.4042
## s.e.  0.6129  0.4715  0.2347   0.5938  0.8854   0.5089  0.1803
## 
## sigma^2 = 26.96:  log likelihood = -1815.59
## AIC=3647.18   AICc=3647.43   BIC=3682.26
ar3=0.4459/0.2347
ma4=0.4042/0.1803
ar3
## [1] 1.899872
ma4
## [1] 2.241819
fit5=Arima(train,order = c(3,2,3))
fit5
## Series: train 
## ARIMA(3,2,3) 
## 
## Coefficients:
##          ar1     ar2      ar3      ma1     ma2     ma3
##       0.5009  0.3118  -0.0289  -1.3234  0.1270  0.2091
## s.e.  0.4497  0.3236   0.0635   0.4479  0.6814  0.2606
## 
## sigma^2 = 27.03:  log likelihood = -1816.86
## AIC=3647.72   AICc=3647.91   BIC=3678.42
ar3=abs(-0.0289/0.0635)
ma3=0.2091/0.2606
ar3
## [1] 0.4551181
ma3
## [1] 0.8023791
# NOTE(review): fit6's standard errors are NaN (non-invertible Hessian), so its
# coefficient significance cannot be assessed.
fit6=Arima(train,order = c(2,2,3))
fit6
## Series: train 
## ARIMA(2,2,3) 
## 
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs üretimi
##          ar1     ar2      ma1      ma2     ma3
##       0.0777  0.6394  -0.8882  -0.5889  0.4933
## s.e.     NaN     NaN      NaN      NaN     NaN
## 
## sigma^2 = 27.03:  log likelihood = -1817.37
## AIC=3646.74   AICc=3646.88   BIC=3673.05
fit7=Arima(train,order = c(2,2,2))
fit7
## Series: train 
## ARIMA(2,2,2) 
## 
## Coefficients:
##          ar1     ar2      ma1     ma2
##       0.7691  0.0390  -1.5921  0.6035
## s.e.  0.1523  0.0584   0.1478  0.1388
## 
## sigma^2 = 26.97:  log likelihood = -1817.17
## AIC=3644.35   AICc=3644.45   BIC=3666.28
ar2=0.0390/0.0584
ma2=0.6035/0.1388
ar2
## [1] 0.6678082
ma2
## [1] 4.347983
# NOTE(review): this overwrites fit7 (previously ARIMA(2,2,2)) with ARIMA(2,2,1),
# which duplicates fit1 above.
fit7=Arima(train,order = c(2,2,1))
fit7
## Series: train 
## ARIMA(2,2,1) 
## 
## Coefficients:
##          ar1     ar2      ma1
##       0.1252  0.1297  -0.9370
## s.e.  0.0474  0.0466   0.0212
## 
## sigma^2 = 27.07:  log likelihood = -1818.72
## AIC=3645.44   AICc=3645.51   BIC=3662.98
# NOTE(review): the constants below (0.0929/0.0503, -0.9512/0.0262) do not match
# the coefficients printed for this fit (0.1252, 0.1297, -0.9370); they appear to
# be copied from a different model — verify before relying on these ratios.
ar4=0.0929/0.0503
ma1=abs(-0.9512/0.0262)
ar4
## [1] 1.846918
ma1
## [1] 36.30534
fit8=Arima(train,order = c(2,2,4))
fit8
## Series: train 
## ARIMA(2,2,4) 
## 
## Coefficients:
##           ar1      ar2     ma1      ma2      ma3      ma4
##       -1.2556  -0.4219  0.4488  -0.5833  -0.4429  -0.1575
## s.e.   0.2831   0.2760  0.2803   0.1059   0.2385   0.0426
## 
## sigma^2 = 27.08:  log likelihood = -1817.34
## AIC=3648.68   AICc=3648.87   BIC=3679.37
ma4=abs(-0.1575/0.0426)
ar2=abs(-0.4219/0.2760)
ar2
## [1] 1.528623
ma4
## [1] 3.697183

Since fit3 has the lowest AIC, we choose it first.

# d=1 alternatives for comparison.
fit9=Arima(train,order = c(4,1,5))
fit9
## Series: train 
## ARIMA(4,1,5) 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4      ma1     ma2      ma3     ma4
##       1.3234  -0.9494  1.3522  -0.7272  -1.1291  0.9044  -1.3126  0.6042
## s.e.  0.1336   0.0812  0.0775   0.1268   0.1400  0.0767   0.0837  0.1243
##           ma5
##       -0.0206
## s.e.   0.0458
## 
## sigma^2 = 26.82:  log likelihood = -1817.26
## AIC=3654.52   AICc=3654.9   BIC=3698.39
ma4=abs(-0.7272/0.1268)
ar2=abs(-0.0206/0.0458)
ar2
## [1] 0.4497817
ma4
## [1] 5.735016
fit10=Arima(train,order = c(1,1,5))
fit10
## Series: train 
## ARIMA(1,1,5) 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3     ma4      ma5
##       0.9989  -0.8178  0.0278  -0.1118  0.0234  -0.0292
## s.e.  0.0015   0.0431  0.0552   0.0598  0.0588   0.0449
## 
## sigma^2 = 27.23:  log likelihood = -1822.66
## AIC=3659.32   AICc=3659.51   BIC=3690.02
ma4=abs(0.9989/0.0015)
ar2=abs(-0.0292/0.0449)
ar2
## [1] 0.6503341
ma4
## [1] 665.9333

fit1=ARIMA(2,2,1) and fit3=ARIMA(4,2,3)

fit3
## Series: train 
## ARIMA(4,2,3) 
## 
## Coefficients:
##          ar1      ar2     ar3     ar4     ma1     ma2      ma3
##       0.1574  -0.8454  0.1250  0.1581  -0.972  1.0286  -0.9402
## s.e.  0.0478   0.0461  0.0476  0.0463   0.023  0.0174   0.0222
## 
## sigma^2 = 26.49:  log likelihood = -1812.4
## AIC=3640.8   AICc=3641.05   BIC=3675.88
# Standardized residuals of fit3.
r=resid(fit3)/sd(residuals(fit3))
head(r)
## Time Series:
## Start = 19048 
## End = 19053 
## Frequency = 1 
## [1]  0.01277423 -0.03832256  2.58245119  1.49520387  0.76980521 -0.11041227
autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")

Residuals are scattered around zero and it can be interpreted as zero mean

ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

The QQ plot shows that most of the residuals do not lie on the 45-degree straight line. This indicates the residuals are not normally distributed.

# Histogram of fit3's standardized residuals.
ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

The histogram of the residuals shows that they do not have a symmetric distribution.

# Ljung-Box test on fit3 residuals (H0: residuals are independent).
checkresiduals(fit3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,2,3)
## Q* = 17.197, df = 3, p-value = 0.0006437
## 
## Model df: 7.   Total lags used: 10

Since the p-value is less than 0.05, the model exhibits lack of fit.

# ACF/PACF of fit3's standardized residuals.
g1<-ggAcf(as.vector(r),lag.max = 72)+theme_minimal()+ggtitle("ACF of  Residuals")
g2<-ggPacf(as.vector(r),lag.max = 72)+theme_minimal()+ggtitle("PACF of  Residuals")  # homoscedasticity check
grid.arrange(g1,g2,ncol=2)

There exist significant spikes in the plots.

# Shapiro-Wilk normality test (H0: residuals are normal).
shapiro.test(r)
## 
##  Shapiro-Wilk normality test
## 
## data:  r
## W = 0.91643, p-value < 2.2e-16

Now, we cannot continue our analysis with ARIMA(4,2,3). Maybe we can continue with fit1, which is ARIMA(2,2,1). Let us check the diagnostics of ARIMA(2,2,1):

fit1
## Series: train 
## ARIMA(2,2,1) 
## 
## Coefficients:
##          ar1     ar2      ma1
##       0.1252  0.1297  -0.9370
## s.e.  0.0474  0.0466   0.0212
## 
## sigma^2 = 27.07:  log likelihood = -1818.72
## AIC=3645.44   AICc=3645.51   BIC=3662.98
# Standardized residuals of fit1.
r2=resid(fit1)/sd(residuals(fit1))

ggplot(r2, aes(sample = r2)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

The QQ plot shows that most of the residuals do not lie on the 45-degree straight line. This indicates the residuals are not normally distributed.

autoplot(r2)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")

Residuals are scattered around zero and it can be interpreted as zero mean

# NOTE(review): this QQ plot duplicates the one above.
ggplot(r2, aes(sample = r2)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

The QQ plot shows that most of the residuals do not lie on the 45-degree straight line. This indicates the residuals are not normally distributed.

# Histogram of fit1's standardized residuals.
ggplot(r2,aes(x=r2))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

The histogram of the residuals shows that they do not have a symmetric distribution.

# Ljung-Box test on fit1 residuals (H0: residuals are independent).
checkresiduals(fit1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,2,1)
## Q* = 12.457, df = 7, p-value = 0.08649
## 
## Model df: 3.   Total lags used: 10

Since the p-value (0.08649) is greater than 0.05, we fail to reject the null hypothesis of the Ljung-Box test; the model does not exhibit lack of fit.

# ACF/PACF of fit1's standardized residuals.
g1<-ggAcf(as.vector(r2),lag.max = 72)+theme_minimal()+ggtitle("ACF of  Residuals")
g2<-ggPacf(as.vector(r2),lag.max = 72)+theme_minimal()+ggtitle("PACF of  Residuals")  # homoscedasticity check
grid.arrange(g1,g2,ncol=2)

There exist significant spikes in the plots.

# Heteroscedasticity check on SQUARED residuals. BUG FIX: the original plotted
# the plain residuals (r2) under "Squared Residuals" titles; square them here,
# consistent with the later rr2 = r2^2 chunk.
g1<-ggAcf(as.vector(r2^2),lag.max = 72)+theme_minimal()+ggtitle("ACF of Squared Residuals")
g2<-ggPacf(as.vector(r2^2),lag.max = 72)+theme_minimal()+ggtitle("PACF of Squared Residuals")  # homoscedasticity check
grid.arrange(g1,g2,ncol=2)

There exist significant spikes in the plots.

# Shapiro-Wilk normality test on fit1's standardized residuals.
shapiro.test(r2)
## 
##  Shapiro-Wilk normality test
## 
## data:  r2
## W = 0.90164, p-value < 2.2e-16

Since the p-value is less than 0.05, we reject the null hypothesis that the residuals are normally distributed.

# Breusch-Godfrey serial-correlation test via an AR(1) auxiliary regression.
m=lm(r~1+zlag(r))
library(lmtest)


bgtest(m, order = 15)
## 
##  Breusch-Godfrey test for serial correlation of order up to 15
## 
## data:  m
## LM test = 26.156, df = 15, p-value = 0.03641

Since p value is less than alpha, the residuals of the model are correlated, according to results of Breusch-Godfrey Test.

# Same test for fit1's residuals.
m2=lm(r2~1+zlag(r2))


bgtest(m2, order = 15)
## 
##  Breusch-Godfrey test for serial correlation of order up to 15
## 
## data:  m2
## LM test = 25.8, df = 15, p-value = 0.04018

Since p value is less than alpha, the residuals of the model are correlated, according to results of Breusch-Godfrey Test.

# ACF/PACF of squared fit3 residuals + ARCH LM test (H0: no ARCH effects).
rr=r^2
g1<-ggAcf(as.vector(rr))+theme_minimal()+ggtitle("ACF of Squared Residuals")
g2<-ggPacf(as.vector(rr),lag.max = 72)+theme_minimal()+ggtitle("PACF of Squared Residuals")  # homoscedasticity check
grid.arrange(g1,g2,ncol=2)

library(FinTS)
## 
## Attaching package: 'FinTS'
## The following object is masked from 'package:forecast':
## 
##     Acf
ArchTest(r)
## 
##  ARCH LM-test; Null hypothesis: no ARCH effects
## 
## data:  r
## Chi-squared = 17.147, df = 12, p-value = 0.1441

Although there are some significant spikes in the plots, the ARCH LM test p-value is greater than 0.05, so the residuals exhibit no significant ARCH effects.

# Same heteroscedasticity check for fit1's residuals.
rr2=r2^2
g1<-ggAcf(as.vector(rr2),lag.max = 72)+theme_minimal()+ggtitle("ACF of Squared Residuals")
g2<-ggPacf(as.vector(rr2),lag.max = 72)+theme_minimal()+ggtitle("PACF of Squared Residuals")  # homoscedasticity check
grid.arrange(g1,g2,ncol=2)

library(FinTS)
ArchTest(r2)
## 
##  ARCH LM-test; Null hypothesis: no ARCH effects
## 
## data:  r2
## Chi-squared = 13.382, df = 12, p-value = 0.3419

Although there are some significant spikes in the plots, the ARCH LM test p-value is greater than 0.05, so the residuals exhibit no significant ARCH effects.

Now, we can continue our analysis with fit1 which is ARIMA(2,2,1) since it does not exhibit lack of fit.

set.seed(497)
# Forecast the hold-out horizon with the chosen ARIMA(2,2,1).
test=read.zoo(test)
f=forecast(fit1,h=length(test))
summary(f)
## 
## Forecast method: ARIMA(2,2,1)
## 
## Model Information:
## Series: train 
## ARIMA(2,2,1) 
## 
## Coefficients:
##          ar1     ar2      ma1
##       0.1252  0.1297  -0.9370
## s.e.  0.0474  0.0466   0.0212
## 
## sigma^2 = 27.07:  log likelihood = -1818.72
## AIC=3645.44   AICc=3645.51   BIC=3662.98
## 
## Error measures:
##                       ME     RMSE      MAE       MPE      MAPE     MASE
## Training set -0.05560763 5.180923 3.655655 0.0164543 0.2503019 0.463294
##                     ACF1
## Training set 0.005892095
## 
## Forecasts:
##       Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 19643       4835.454 4828.787 4842.122 4825.257 4845.652
## 19644       4842.717 4832.362 4853.072 4826.880 4858.553
## 19645       4850.917 4836.955 4864.878 4829.565 4872.269
## 19646       4859.339 4842.091 4876.587 4832.960 4885.718
## 19647       4867.911 4847.544 4888.277 4836.762 4899.059
## 19648       4876.530 4853.171 4899.889 4840.805 4912.254
## 19649       4885.174 4858.901 4911.447 4844.993 4925.355
## 19650       4893.828 4864.691 4922.965 4849.266 4938.390
## 19651       4902.486 4870.512 4934.461 4853.586 4951.387
## 19652       4911.146 4876.348 4945.945 4857.927 4964.366
## 19653       4919.807 4882.186 4957.428 4862.271 4977.344
## 19654       4928.468 4888.018 4968.919 4866.605 4990.332
## 19655       4937.130 4893.838 4980.421 4870.921 5003.339
## 19656       4945.791 4899.640 4991.941 4875.210 5016.372
## 19657       4954.452 4905.423 5003.482 4879.468 5029.437
## 19658       4963.114 4911.181 5015.046 4883.690 5042.537
## 19659       4971.775 4916.915 5026.635 4887.874 5055.676
## 19660       4980.437 4922.621 5038.252 4892.016 5068.857
## 19661       4989.098 4928.300 5049.896 4896.115 5082.081
## 19662       4997.759 4933.949 5061.570 4900.169 5095.350
## 19663       5006.421 4939.567 5073.274 4904.177 5108.664
## 19664       5015.082 4945.155 5085.009 4908.138 5122.026
## 19665       5023.744 4950.712 5096.775 4912.051 5135.436
## 19666       5032.405 4956.237 5108.573 4915.916 5148.894
## 19667       5041.066 4961.730 5120.402 4919.733 5162.400
## 19668       5049.728 4967.192 5132.264 4923.500 5175.956
## 19669       5058.389 4972.621 5144.157 4927.218 5189.560
## 19670       5067.051 4978.018 5156.083 4930.887 5203.214
## 19671       5075.712 4983.382 5168.041 4934.506 5216.918
## 19672       5084.373 4988.715 5180.032 4938.076 5230.670
## 19673       5093.035 4994.015 5192.054 4941.597 5244.472
# Exponential smoothing with automatic error/trend/season selection ("ZZZ").
ets=ets(train,model = "ZZZ")
etsforc=forecast(ets,h=length(test))
summary(ets)
## ETS(A,A,N) 
## 
## Call:
##  ets(y = train, model = "ZZZ") 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.1456 
## 
##   Initial states:
##     l = 124.1091 
##     b = 13.1785 
## 
##   sigma:  5.2568
## 
##      AIC     AICc      BIC 
## 5782.002 5782.104 5803.945 
## 
## Training set error measures:
##                      ME     RMSE      MAE          MPE      MAPE      MASE
## Training set -0.0549773 5.239054 3.688071 -0.001526009 0.2651306 0.4674022
##                    ACF1
## Training set 0.03969414
# ETS residual diagnostics.
# NOTE(review): reuses the name r, clobbering the fit3 residuals above.
r=etsforc$residuals

ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")

ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

shapiro.test(r)
## 
##  Shapiro-Wilk normality test
## 
## data:  r
## W = 0.91062, p-value < 2.2e-16
# Neural network autoregression (NNAR).
nnetar.m=nnetar(train)
nnetar_forecast<-forecast(nnetar.m,h=length(test))
nnetar.m$method
## [1] "NNAR(1,1)"
nnetar.m$call
## nnetar(y = train)
nnetar.m$model
## 
## Average of 20 networks, each of which is
## a 1-1-1 network with 4 weights
## options were - linear output units
# NNAR residual diagnostics (one residual is non-finite, per the warnings below).
r=nnetar.m$residuals

ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Warning: Removed 1 rows containing non-finite values (`stat_qq()`).
## Warning: Removed 1 rows containing non-finite values (`stat_qq_line()`).

autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")
## Warning: Removed 1 row containing missing values (`geom_line()`).

ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 1 rows containing non-finite values (`stat_density()`).

shapiro.test(r)
## 
##  Shapiro-Wilk normality test
## 
## data:  r
## W = 0.97479, p-value = 1.391e-08
# TBATS model (selection reduced to a BATS(1,{0,0},0.995,-) here).
tbatsmodel<-tbats(ts(train))
tbats_forecast<-forecast(tbatsmodel,h=length(test))
tbatsmodel
## BATS(1, {0,0}, 0.995, -)
## 
## Call: tbats(y = ts(train))
## 
## Parameters
##   Alpha: 1.051277
##   Beta: 0.122806
##   Damping Parameter: 0.994958
## 
## Seed States:
##           [,1]
## [1,] 125.04875
## [2,]  15.85002
## 
## Sigma: 5.21842
## AIC: 5777.306
# TBATS residual diagnostics.
r=ts(tbats_forecast$residuals)

ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")

ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

shapiro.test(r)
## 
##  Shapiro-Wilk normality test
## 
## data:  r
## W = 0.90448, p-value < 2.2e-16
# Grid search over prophet changepoint hyper-parameters, scored on the hold-out set.
changepoint_prior_scale=c(0.001, 0.01, 0.1, 0.5)
changepoint_range=c(0.8,0.85,0.90, 0.95)
colnames(train2)=c("ds","y")  # prophet requires columns named ds (date) and y (value)
for(i in changepoint_prior_scale){
  for(j in changepoint_range){
    prophet.m=prophet(train2,daily.seasonality = FALSE,yearly.seasonality = FALSE,changepoint.prior.scale =i ,changepoint.range =j   )
    # Extend the frame 31 days past the training window (the hold-out horizon).
    future=make_future_dataframe(prophet.m, periods = 31)
    prophet_forecast=predict(prophet.m,future)
    print(i)
    print(j)
    # BUG FIX: prophet_forecast$yhat covers the whole training window PLUS the
    # 31 future days; only the last 31 values correspond to the test set. The
    # original passed the full yhat vector to accuracy(), misaligning fitted
    # values from the start of training with the test observations (which is
    # why the reported MAPEs were ~88-93%).
    k=data.frame(accuracy(tail(prophet_forecast$yhat, 31),test))
    rownames(k)=paste(as.character(i),as.character(j),sep = "-")
    print(k)
    
}}
## [1] 0.001
## [1] 0.8
##                 ME     RMSE      MAE      MPE     MAPE
## 0.001-0.8 4534.172 4534.569 4534.172 88.00939 88.00939
## [1] 0.001
## [1] 0.85
##                  ME     RMSE      MAE      MPE     MAPE
## 0.001-0.85 4557.406 4557.785 4557.406 88.46109 88.46109
## [1] 0.001
## [1] 0.9
##                 ME     RMSE      MAE      MPE     MAPE
## 0.001-0.9 4531.847 4532.249 4531.847 87.96403 87.96403
## [1] 0.001
## [1] 0.95
##                 ME     RMSE     MAE      MPE     MAPE
## 0.001-0.95 4595.92 4596.266 4595.92 89.21016 89.21016
## [1] 0.01
## [1] 0.8
##                ME     RMSE      MAE      MPE     MAPE
## 0.01-0.8 4794.007 4794.066 4794.007 93.07424 93.07424
## [1] 0.01
## [1] 0.85
##                 ME     RMSE      MAE      MPE     MAPE
## 0.01-0.85 4794.718 4794.776 4794.718 93.08809 93.08809
## [1] 0.01
## [1] 0.9
##                ME     RMSE      MAE      MPE     MAPE
## 0.01-0.9 4795.329 4795.371 4795.329 93.10193 93.10193
## [1] 0.01
## [1] 0.95
##                 ME     RMSE      MAE      MPE     MAPE
## 0.01-0.95 4794.353 4794.397 4794.353 93.08295 93.08295
## [1] 0.1
## [1] 0.8
##              ME     RMSE     MAE      MPE     MAPE
## 0.1-0.8 4792.21 4792.223 4792.21 93.04694 93.04694
## [1] 0.1
## [1] 0.85
##                ME     RMSE      MAE      MPE     MAPE
## 0.1-0.85 4792.386 4792.397 4792.386 93.05119 93.05119
## [1] 0.1
## [1] 0.9
##               ME     RMSE      MAE      MPE     MAPE
## 0.1-0.9 4791.264 4791.274 4791.264 93.02981 93.02981
## [1] 0.1
## [1] 0.95
##                ME     RMSE      MAE      MPE     MAPE
## 0.1-0.95 4790.621 4790.633 4790.621 93.01801 93.01801
## [1] 0.5
## [1] 0.8
##               ME     RMSE      MAE      MPE     MAPE
## 0.5-0.8 4792.126 4792.139 4792.126 93.04606 93.04606
## [1] 0.5
## [1] 0.85
##                ME     RMSE      MAE      MPE     MAPE
## 0.5-0.85 4791.652 4791.663 4791.652 93.03733 93.03733
## [1] 0.5
## [1] 0.9
##               ME     RMSE      MAE      MPE     MAPE
## 0.5-0.9 4790.899 4790.908 4790.899 93.02324 93.02324
## [1] 0.5
## [1] 0.95
##               ME     RMSE     MAE      MPE     MAPE
## 0.5-0.95 4789.96 4789.972 4789.96 93.00599 93.00599
# Refit Prophet with the grid-search winner
# (changepoint.prior.scale = 0.001, changepoint.range = 0.9) and forecast
# length(test) periods past the end of the training data.
prophet.m=prophet(train2,daily.seasonality = FALSE,yearly.seasonality = FALSE,changepoint.prior.scale =0.001,changepoint.range =0.9   )
future=make_future_dataframe(prophet.m, periods = length(test))
prophet_forecast=predict(prophet.m,future)
# No extra keyword arguments were passed through to the underlying fit.
prophet.m$fit.kwargs
## list()
# NOTE(review): `r` is prediction minus actual over the FULL series, i.e.
# the sign is flipped relative to the conventional actual - fitted residual,
# and it mixes in-sample and out-of-sample errors; this assumes
# length(prophet_forecast$yhat) == length(data$tank) — confirm.
r=ts(prophet_forecast$yhat-data$tank)
# Ljung-Box test plus residual plots (forecast::checkresiduals).
checkresiduals(r)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 5141.7, df = 10, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 10

Since the p-value (&lt; 2.2e-16) is less than 0.05, the residuals show significant autocorrelation — the model does exhibit lack of fit.

# QQ plot of the Prophet residuals against a normal reference line.
ggplot(r, aes(sample = r)) +stat_qq()+geom_qq_line()+ggtitle("QQ Plot of the Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

# Time plot with a zero reference line (see earlier note on geom_line(y=0)
# vs geom_hline).
autoplot(r)+geom_line(y=0)+theme_minimal()+ggtitle("Plot of The Residuals")

# Histogram with a density overlay (count vs density scales, as before).
ggplot(r,aes(x=r))+geom_histogram(bins=20)+geom_density()+ggtitle("Histogram of Residuals")+theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

# Shapiro-Wilk: W = 0.98078, p-value = 2.523e-07 < 0.05, so normality of the
# Prophet residuals is rejected.
shapiro.test(r)
## 
##  Shapiro-Wilk normality test
## 
## data:  r
## W = 0.98078, p-value = 2.523e-07
# Hold-out accuracy comparison across all candidate models. For forecast
# objects, accuracy() reports both training-set and test-set rows; the
# test-set rows are what the model ranking below is based on.
print("ARIMA(2,2,1)")
## [1] "ARIMA(2,2,1)"
accuracy(f,test)
##                        ME       RMSE        MAE       MPE      MAPE      MASE
## Training set  -0.05560763   5.180923   3.655655 0.0164543 0.2503019  0.463294
## Test set     190.34392895 198.410056 190.343929 3.6681379 3.6681379 24.122956
##                     ACF1
## Training set 0.005892095
## Test set              NA
print("ETS")
## [1] "ETS"
accuracy(etsforc,test)
##                       ME       RMSE        MAE          MPE      MAPE
## Training set  -0.0549773   5.239054   3.688071 -0.001526009 0.2651306
## Test set     189.9388230 198.572654 189.938823  3.659281250 3.6592812
##                    MASE       ACF1
## Training set  0.4674022 0.03969414
## Test set     24.0716153         NA
print("NNETAR")
## [1] "NNETAR"
accuracy(nnetar_forecast,test)
##                       ME       RMSE        MAE        MPE     MAPE      MASE
## Training set   0.2493625   9.524709   7.468734 -0.2165987 0.585039  0.946539
## Test set     480.2073608 520.639393 480.207361  9.2234421 9.223442 60.858368
##                   ACF1
## Training set 0.7388282
## Test set            NA
print("TBATS")
## [1] "TBATS"
accuracy(tbats_forecast,test)
##                      ME      RMSE        MAE         MPE      MAPE       MASE
## Training set   0.213985   5.21842   3.609074 0.002092469 0.2553715  0.4573906
## Test set     198.887362 208.98148 198.887362 3.829788413 3.8297884 25.2056952
##                     ACF1
## Training set 0.003401719
## Test set              NA
print("Prophet")
## [1] "Prophet"
# BUG FIX: `yhat` spans the training period plus the forecast horizon, so
# only its final length(test) values are comparable to the test set. The
# previous call accuracy(prophet_forecast$yhat, test) lined the test set up
# against the start of yhat, producing the wildly inflated numbers printed
# below (~88% MAPE). Re-render the report to refresh this output.
accuracy(tail(prophet_forecast$yhat, length(test)),test)
## Stale output from the misaligned call:
##                ME     RMSE      MAE      MPE     MAPE
## Test set 4531.847 4532.249 4531.847 87.96403 87.96403

Based on the reported hold-out accuracy measures, the best model is ETS (Holt’s linear trend method), which attains the lowest test-set ME, MAE and MAPE (ARIMA’s RMSE is marginally lower), and the worst model is Prophet.

# Visual comparison of each model's fitted values and forecasts against the
# observed tank-loss series.
data_new=data


# Tank-loss series indexed by date-as-integer (days since 1970-01-01;
# 19048 corresponds to late Feb 2022, the start of the series — confirm).
Series=ts(data_new$tank,start = 19048,end =19547 )
# ARIMA forecast with fitted values dashed; the vertical line at 19643
# (= 19048 + 595) presumably marks the forecast horizon end on the same
# day-number axis — TODO confirm.
autoplot(f,series="ARIMA",color="blue")+autolayer(Series)+autolayer(fitted(fit1),color = "red", series="Fitted",linetype="dashed") + geom_vline(xintercept =19643    )+
  ggplot2::ggtitle("ARIMA")+theme_minimal()+theme(legend.position="bottom")

# Base-graphics view of the ETS forecast, then the ggplot version.
plot(etsforc)

autoplot(etsforc,series="ETS",color="red")+autolayer(Series)+autolayer(fitted(etsforc),series="Fitted",linetype="dashed") + geom_vline(xintercept =19643     )+
  ggplot2::ggtitle("ETS")+theme_minimal()+theme(legend.position="bottom")

plot(nnetar_forecast)

autoplot(Series,main="nnetar_forecast")+autolayer(nnetar_forecast) +autolayer(fitted(nnetar_forecast), series="Fitted",linetype="dashed")+ geom_vline(xintercept =19643  )+
  ggplot2::ggtitle("NNETAR") +theme_minimal()+theme(legend.position="bottom")
## Warning: Removed 1 row containing missing values (`geom_line()`).

# TBATS was fitted on an index starting at 1, so use a matching 1-based
# series; the vertical line at 595 marks the same split point as 19643 on
# the date axis (19048 + 595 = 19643).
Series2=ts(data_new$tank,start =1 )
plot(tbats_forecast)

autoplot(Series2,series="data",color="red")+autolayer(tbats_forecast,series="Forecast")+autolayer(fitted(tbats_forecast),series="Fitted",linetype="dashed") + geom_vline(xintercept =595     )+
  ggplot2::ggtitle("TBATS")+theme_minimal()+theme(legend.position="bottom")

# Prophet's own plot method: history, forecast and uncertainty interval.
plot(prophet.m,prophet_forecast)